1 package edu.jiangxin.apktoolbox.pdf;
2
3 import com.itextpdf.kernel.pdf.PdfDocument;
4 import com.itextpdf.kernel.pdf.PdfReader;
5 import com.itextpdf.kernel.pdf.PdfWriter;
6 import org.apache.commons.io.IOUtils;
7 import org.apache.logging.log4j.LogManager;
8 import org.apache.logging.log4j.Logger;
9 import org.apache.pdfbox.Loader;
10 import org.apache.pdfbox.pdmodel.PDDocument;
11 import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
12 import org.apache.pdfbox.pdmodel.PDPage;
13 import org.apache.pdfbox.pdmodel.PDPageTree;
14 import org.apache.pdfbox.text.PDFTextStripper;
15
16 import java.io.File;
17 import java.io.IOException;
18
19 public class PdfUtils {
20 private static final Logger LOGGER = LogManager.getLogger(PdfUtils.class.getSimpleName());
21 public static boolean isScannedPdf(File file, int threshold) {
22 int length = 0;
23
24 try (PDDocument document = Loader.loadPDF(file)) {
25 boolean isEncrypted = document.isEncrypted();
26 if (isEncrypted) {
27 document.setAllSecurityToBeRemoved(true);
28 }
29
30 PDFTextStripper stripper = new PDFTextStripper();
31 String text = stripper.getText(document).trim();
32 length = text.length();
33 } catch (IOException e) {
34 LOGGER.error("Error reading PDF file: {}", e.getMessage());
35 return false;
36 }
37 LOGGER.info("Processing file: {}, text size: {}", file.getPath(), length);
38 return length < threshold;
39 }
40
41 public static boolean isEncryptedPdf(File file) {
42 boolean isEncrypted;
43
44 try (PDDocument document = Loader.loadPDF(file)) {
45 isEncrypted = document.isEncrypted();
46 } catch (IOException e) {
47 LOGGER.error("Error reading PDF file: {}", e.getMessage());
48 return false;
49 }
50 LOGGER.info("Processing file: {}, is encrypted: {}", file.getPath(), isEncrypted);
51 return isEncrypted;
52 }
53
54 public static boolean isNonOutlinePdf(File file) {
55 boolean hasOutline = false;
56
57 try (PDDocument document = Loader.loadPDF(file)) {
58 boolean isEncrypted = document.isEncrypted();
59 if (isEncrypted) {
60 document.setAllSecurityToBeRemoved(true);
61 }
62
63 if (document.getDocumentCatalog() != null && document.getDocumentCatalog().getDocumentOutline() != null) {
64 hasOutline = true;
65 }
66 } catch (IOException e) {
67 LOGGER.error("Error reading PDF file: {}", e.getMessage());
68 return false;
69 }
70 LOGGER.info("Processing file: {}, has outline: {}", file.getPath(), hasOutline);
71 return !hasOutline;
72 }
73
74 public static boolean hasAnnotations(File file) {
75 boolean hasAnnotations = false;
76
77 try (PDDocument document = Loader.loadPDF(file)) {
78 boolean isEncrypted = document.isEncrypted();
79 if (isEncrypted) {
80 document.setAllSecurityToBeRemoved(true);
81 }
82 PDDocumentCatalog catalog = document.getDocumentCatalog();
83 if (catalog == null) {
84 return false;
85 }
86 PDPageTree pages = document.getDocumentCatalog().getPages();
87 if (pages == null || pages.getCount() == 0) {
88 return false;
89 }
90
91 for (PDPage page : pages) {
92 if (page.getAnnotations() != null && !page.getAnnotations().isEmpty()) {
93 int pageNumber = page.getCOSObject().getInt("PageNumber", 0);
94 String subType = page.getAnnotations().get(0).getSubtype();
95 LOGGER.info("Found annotations on page: {}, subType: {}", pageNumber, subType);
96 if (!subType.equals("Link")) {
97 hasAnnotations = true;
98 break;
99 }
100 }
101 }
102 } catch (IOException e) {
103 LOGGER.error("Error reading PDF file: {}", e.getMessage());
104 return hasAnnotations;
105 }
106 LOGGER.info("Processing file: {}, has annotations: {}", file.getPath(), hasAnnotations);
107 return hasAnnotations;
108 }
109
110 public static void removePassword(File encryptedFile, File targetDir) {
111 try (PDDocument document = Loader.loadPDF(encryptedFile)) {
112 boolean isEncrypted = document.isEncrypted();
113 if (isEncrypted) {
114 document.setAllSecurityToBeRemoved(true);
115 }
116 String targetFilePath = targetDir.getAbsolutePath() + File.separator + encryptedFile.getName();
117 document.save(targetFilePath);
118 LOGGER.info("Remove password success: {}", targetFilePath);
119 } catch (IOException e) {
120 LOGGER.error("Error processing PDF file: {}", e.getMessage());
121 }
122 }
123
124 public static void removePasswordWithIText(File encryptedFile, File targetDir) {
125 String targetFilePath = targetDir.getAbsolutePath() + File.separator + encryptedFile.getName();
126 PdfReader reader = null;
127 PdfDocument pdfDoc = null;
128 try {
129 reader = new PdfReader(encryptedFile);
130 reader.setUnethicalReading(true);
131 PdfWriter writer = new PdfWriter(targetFilePath);
132 pdfDoc = new PdfDocument(reader, writer);
133 } catch (IOException e) {
134 LOGGER.error("Error processing PDF file: {}", e.getMessage());
135 } finally {
136 IOUtils.closeQuietly(pdfDoc);
137 IOUtils.closeQuietly(reader);
138 }
139 }
140
141 public static int getPageCount(File file) {
142 int pageCount = 0;
143
144 try (PDDocument document = Loader.loadPDF(file)) {
145 boolean isEncrypted = document.isEncrypted();
146 if (isEncrypted) {
147 document.setAllSecurityToBeRemoved(true);
148 }
149 pageCount = document.getNumberOfPages();
150 } catch (IOException e) {
151 LOGGER.error("Error reading PDF file: {}", e.getMessage());
152 return 0;
153 }
154 LOGGER.info("Processing file: {}, page count: {}", file.getPath(), pageCount);
155 return pageCount;
156 }
157 }